#Librairies utilisées:
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats
import numpy as np
import seaborn as sns
import plotly
import plotly.graph_objects as go
import chart_studio
import chart_studio.plotly as py
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
plotly.offline.init_notebook_mode()
import plotly.express as px
from pySankey.sankey import sankey
init_notebook_mode(connected=True) # initiate notebook for offline plot
from plotly.subplots import make_subplots
import plotly.io as pio
#pio.renderers.default = 'svg'
pio.renderers.default = 'browser'
import plotly.offline as pyo
pyo.init_notebook_mode()
from IPython.display import Image, HTML, display, SVG
import missingno as msno
from pywaffle import Waffle
display(HTML("<style>.container { width:100% !important; }</style>"))
# Raw data: load the heart-disease dataset from a local Excel file.
# Columns (per data.columns later): AGE, SEXE, TDT, PAR, CHOLESTEROL, GAJ,
# ECG, FCMAX, ANGINE, 'DEPRESSION ' (trailing space!), PENTE, CŒUR (target).
data= pd.read_excel(r'~/8e49d734-bd63-432d-814e-6c4599f33f04.xlsx')
data
| AGE | SEXE | TDT | PAR | CHOLESTEROL | GAJ | ECG | FCMAX | ANGINE | DEPRESSION | PENTE | CŒUR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | homme | AA | 140 | 289 | 0 | Normal | 172 | Non | 0.0 | Ascendant | 0 |
| 1 | 49 | femme | DNA | 160 | 180 | 0 | Normal | 156 | Non | 1.0 | Plat | 1 |
| 2 | 37 | homme | AA | 130 | 283 | 0 | ST | 98 | Non | 0.0 | Ascendant | 0 |
| 3 | 48 | femme | ASY | 138 | 214 | 0 | Normal | 108 | Oui | 1.5 | Plat | 1 |
| 4 | 54 | homme | DNA | 150 | 195 | 0 | Normal | 122 | Non | 0.0 | Ascendant | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 913 | 45 | homme | AT | 110 | 264 | 0 | Normal | 132 | Non | 1.2 | Plat | 1 |
| 914 | 68 | homme | ASY | 144 | 193 | 1 | Normal | 141 | Non | 3.4 | Plat | 1 |
| 915 | 57 | homme | ASY | 130 | 131 | 0 | Normal | 115 | Oui | 1.2 | Plat | 1 |
| 916 | 57 | femme | AA | 130 | 236 | 0 | LVH | 174 | Non | 0.0 | Plat | 1 |
| 917 | 38 | homme | DNA | 138 | 175 | 0 | Normal | 173 | Non | 0.0 | Ascendant | 0 |
918 rows × 12 columns
# Global statistical summary; include='all' also reports unique/top/freq
# for the categorical (object) columns alongside the numeric stats.
desc=data.describe(include='all')
desc
| AGE | SEXE | TDT | PAR | CHOLESTEROL | GAJ | ECG | FCMAX | ANGINE | DEPRESSION | PENTE | CŒUR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 918.000000 | 918 | 918 | 918.000000 | 918.000000 | 918.000000 | 918 | 918.000000 | 918 | 918.000000 | 918 | 918.000000 |
| unique | NaN | 2 | 4 | NaN | NaN | NaN | 3 | NaN | 2 | NaN | 3 | NaN |
| top | NaN | homme | ASY | NaN | NaN | NaN | Normal | NaN | Non | NaN | Plat | NaN |
| freq | NaN | 725 | 496 | NaN | NaN | NaN | 552 | NaN | 547 | NaN | 460 | NaN |
| mean | 53.510893 | NaN | NaN | 132.396514 | 198.799564 | 0.233115 | NaN | 136.809368 | NaN | 0.887364 | NaN | 0.553377 |
| std | 9.432617 | NaN | NaN | 18.514154 | 109.384145 | 0.423046 | NaN | 25.460334 | NaN | 1.066570 | NaN | 0.497414 |
| min | 28.000000 | NaN | NaN | 0.000000 | 0.000000 | 0.000000 | NaN | 60.000000 | NaN | -2.600000 | NaN | 0.000000 |
| 25% | 47.000000 | NaN | NaN | 120.000000 | 173.250000 | 0.000000 | NaN | 120.000000 | NaN | 0.000000 | NaN | 0.000000 |
| 50% | 54.000000 | NaN | NaN | 130.000000 | 223.000000 | 0.000000 | NaN | 138.000000 | NaN | 0.600000 | NaN | 1.000000 |
| 75% | 60.000000 | NaN | NaN | 140.000000 | 267.000000 | 0.000000 | NaN | 156.000000 | NaN | 1.500000 | NaN | 1.000000 |
| max | 77.000000 | NaN | NaN | 200.000000 | 603.000000 | 1.000000 | NaN | 202.000000 | NaN | 6.200000 | NaN | 1.000000 |
# Qualitative-data analysis: print every distinct category of each text column.
object_columns = data.select_dtypes('object')
for column in object_columns:
    # '{col :-<30}' left-pads the name with dashes to align the value lists.
    print(f'{column :-<30}, {data[column].unique()}')
SEXE--------------------------, ['homme' 'femme'] TDT---------------------------, ['AA' 'DNA' 'ASY' 'AT'] ECG---------------------------, ['Normal' 'ST' 'LVH'] ANGINE------------------------, ['Non' 'Oui'] PENTE-------------------------, ['Ascendant' 'Plat' 'Descendant']
# Information on column dtypes and non-null counts.
# FIX: DataFrame.info() prints its report and returns None, so the original
# `info = data.info()` stored None and the trailing `info` displayed nothing;
# call it directly instead.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 918 entries, 0 to 917 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AGE 918 non-null int64 1 SEXE 918 non-null object 2 TDT 918 non-null object 3 PAR 918 non-null int64 4 CHOLESTEROL 918 non-null int64 5 GAJ 918 non-null int64 6 ECG 918 non-null object 7 FCMAX 918 non-null int64 8 ANGINE 918 non-null object 9 DEPRESSION 918 non-null float64 10 PENTE 918 non-null object 11 CŒUR 918 non-null int64 dtypes: float64(1), int64(6), object(5) memory usage: 86.2+ KB
# Missing values: the missingno matrix is fully filled, i.e. no missing value
# in any column (consistent with the 918 non-null counts from info()).
na_values=msno.matrix(data,figsize=(10,3))
na_values
<AxesSubplot:>
# Univariate analysis: density / distribution plot for every numeric column.
# FIX: sns.distplot is deprecated (see the FutureWarnings this cell emitted);
# histplot(kde=True, stat='density') is the supported axes-level replacement
# that draws the same density-scaled histogram with a KDE overlay.
for col in data.select_dtypes('float64'):
    plt.figure()
    sns.histplot(data[col], kde=True, stat='density')
for col in data.select_dtypes('int64'):
    plt.figure()
    sns.histplot(data[col], kde=True, stat='density')
/home/celia/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). /home/celia/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). /home/celia/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). /home/celia/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). /home/celia/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). /home/celia/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
/home/celia/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
# Correlation of the dataset.
# Finding the Pearson correlation between the numeric variables.
# FIX: numeric_only=True makes the exclusion of the 5 object columns explicit;
# on pandas >= 2.0 DataFrame.corr() raises on object columns instead of
# silently dropping them, so this keeps the cell working on modern pandas.
pearsonCorr = data.corr(method='pearson', numeric_only=True)
fig = plt.subplots(figsize=(14,8))
sns.heatmap(pearsonCorr, vmin=-1, vmax=1, cmap="Greens", annot=True, linewidth=0.1)
plt.title("Pearson Correlation")
Text(0.5, 1.0, 'Pearson Correlation')
# Pearson correlation redrawn with the redundant upper triangle masked out.
# Build the upper-triangle mask, then trim the first row / last column from
# both the mask and the correlation matrix so the diagonal row of trivial
# 1.0 self-correlations disappears as well.
maskP = np.triu(np.ones_like(pearsonCorr, dtype=bool))[1:, :-1]
pCorr = pearsonCorr.iloc[1:, :-1].copy()
# Diverging palette: distinct hues for negative vs positive correlations.
cmap = sns.diverging_palette(0, 200, 150, 50, as_cmap=True)
fig = plt.subplots(figsize=(14, 8))
sns.heatmap(pCorr, mask=maskP, cmap=cmap, vmin=-1, vmax=1, annot=True, linewidth=0.3)
plt.title("Pearson Correlation")
Text(0.5, 1.0, 'Pearson Correlation')
# Check how each quantitative variable relates to the heart-disease
# diagnosis, with one violin plot per variable in a 2x2 grid.
# Map the 0/1 target to readable French labels for the y axis.
data['maladie'] = data['CŒUR'] .replace([0,1],['pas malade','malade'])
plt.figure(figsize=(25,18))
plt.subplot(2,2,1)
sns.violinplot(y=data.maladie,x=data.AGE)
# BUG FIX: '' inside a single-quoted literal is adjacent-string concatenation,
# not an escaped apostrophe, so the title rendered "dâge"; double quotes fix it.
plt.title("Les points dans le centre du violon montrent \n la médiane d'âge de chaque groupe")
plt.xlabel('Age')
plt.ylabel('Diagnostic maladie cardiaque')
plt.subplot(2,2,2)
sns.violinplot(y=data.maladie,x=data.PAR)
plt.title('Les points dans le centre du violon montrent \n la médiane de la pression artielle de chaque groupe')
plt.xlabel('Pression arterielle (mmHg)')
plt.ylabel('Diagnostic maladie cardiaque')
plt.subplot(2,2,3)
sns.violinplot(y=data.maladie,x=data.FCMAX)
plt.title('Les points dans le centre du violon montrent \n la médiane de la fréquence cardiaque maximale de chaque groupe')
plt.xlabel('fréquence cardiaque maximale (bpm)')
plt.ylabel('Diagnostic maladie cardiaque')
plt.subplot(2,2,4)
# NOTE: the column name really is 'DEPRESSION ' with a trailing space.
sns.violinplot(y=data.maladie,x=data['DEPRESSION '])
plt.title('Les points dans le centre du violon montrent \n la médiane de la dépression de chaque groupe')
plt.xlabel('Depression (mm)')
plt.ylabel('Diagnostic maladie cardiaque')
relation=plt.show()
print("Les résultats indiquent que les variables ont une distribution différente selon le diagnostic du patient. Cela indique qu'elles ont un impact sur un patient qui souffre d'une maladie cardiaque et ce sont de bonnes variables pour notre modèle.")
Les résultats indiquent que les variables ont une distribution différente selon le diagnostic du patient. Cela indique qu'elles ont un impact sur un patient qui souffre d'une maladie cardiaque et ce sont de bonnes variables pour notre modèle.
# Pairwise variable analysis, coloured by chest-pain type (TDT).
pair=sns.pairplot(data, hue='TDT')
pair
<seaborn.axisgrid.PairGrid at 0x7f041ffc7160>
# Sick people
is_sick = data[data.CŒUR == 1]
print("Le nombre de personne malade est : ", len(is_sick))
# Healthy people
is_notsick = data[data.CŒUR == 0]
print("Le nombre de personne non-malade est : ",len(is_notsick))
# Percentage of affected people.
# BUG FIX: was hard-coded as round(508/(508+410)*100, 2); derive it from the
# data so the figure stays correct if the dataset changes.
perc_sick = round(len(is_sick) / len(data) * 100, 2)
print("Dans notre jeu de données,",perc_sick, "% des personnes ont une maladie cardiaque")
# Percentage of sick women
nbwomen = data[data.SEXE == "femme"]
nbwomensick = len(nbwomen[nbwomen.CŒUR == 1])
pourc_womensick = (nbwomensick / len(nbwomen)) * 100
# Percentage of sick men
nbmen = data[data.SEXE == "homme"]
nbmensick = len(nbmen[nbmen.CŒUR == 1])
pourc_mensick = (nbmensick / len(nbmen)) * 100
Le nombre de personne malade est : 508 Le nombre de personne non-malade est : 410 Dans notre jeu de données, 55.34 % des personnes ont une maladie cardiaque
# Intro waffle chart: whole population vs. people with heart disease
# (1 icon ~= 10 people, hence the /10 scaling).
# BUG FIX: the second value was len(data['maladie']==1)/10 — len() of a
# boolean Series is always the full row count (918), so both categories were
# identical; count the actually-sick rows via the 0/1 CŒUR column instead.
fig = plt.figure(figsize=(5, 5), dpi=150,
                 FigureClass=Waffle,
                 rows=8,
                 values=[len(data)/10, (data['CŒUR'] == 1).sum()/10],
                 colors=["coral", "mediumseagreen"],
                 characters='❤',
                 font_size=10, vertical=True)
fig.text(0.035,0.72,'Personnne affectées par une maladie cardiaque ',fontfamily='sanserif',fontsize=12,fontweight='bold')
fig.text(0.035,0.68,'Dans notre jeu de données, 55% des personnes ont une maladie cardiaque',fontfamily='monospace',fontsize=10)
plt.show()
# Female/male ratio of heart disease.
# One row per (sex, diagnosis) pair; the counted 'ANGINE' column is renamed
# 'Count' (any fully non-null column would give the same counts).
coeur_sexe = data.groupby(by=["SEXE", "CŒUR"]).count()[["ANGINE"]].rename(columns={"ANGINE":"Count"}).reset_index()
coeur_sexe['CŒUR'] = coeur_sexe['CŒUR'] .replace([0,1],['pas malade','malade'])
coeur_sexe['total'] = len(data)
# Sunburst: total -> sex -> diagnosis, sized by the pair counts.
fig = px.sunburst(coeur_sexe, path=['total','SEXE',"CŒUR"], values='Count',color="SEXE",labels={
"0": "Homme",
"1": "Femme"
}, hover_data={"SEXE": True})
fig.layout.update(title_text="Ratio Femme/Homme par rapport à la maladie")
fig.show()
# Waffle chart: all women vs. sick women (1 icon = 1 woman here, no scaling).
fig = plt.figure(figsize=(5, 5),dpi=150,
FigureClass=Waffle,
rows=9,
values=[len(nbwomen), nbwomensick],
colors=["coral", "mediumseagreen"],
characters='♀',
font_size=10,vertical=True,
)
fig.text(0.035,0.72,'Femmes affectées par une maladie cardiaque ',fontfamily='sanserif',fontsize=12,fontweight='bold')
# NOTE(review): the "25 %" in this caption is hard-coded — verify against pourc_womensick.
fig.text(0.035,0.68,'Dans notre jeu de données, 25 % des femmes ont une maladie cardiaque',fontfamily='monospace',fontsize=10)
plt.show()
# Waffle chart: all men vs. sick men (1 icon = 1 man, grid of 12x20 cells).
fig = plt.figure(figsize=(5, 5),dpi=150,
FigureClass=Waffle,
rows=12,
columns=20,
values=[len(nbmen), nbmensick],
colors=["coral", "mediumseagreen"],
characters='♂️',
font_size=10,vertical=True,
)
fig.text(0.035,0.85,'Hommes affectés par une maladie cardiaque ',fontfamily='sanserif',fontsize=12,fontweight='bold')
# NOTE(review): the "63 %" in this caption is hard-coded — verify against pourc_mensick.
fig.text(0.035,0.81,'Dans notre jeu de données, 63 % des hommes ont une maladie cardiaque',fontfamily='monospace',fontsize=10)
plt.show()
# Mean of each quantitative column, by gender, for healthy people.
# FIX: numeric_only=True is required on pandas >= 2.0, where GroupBy.mean()
# raises on object columns instead of silently dropping them (same result
# as the old default on earlier pandas).
# iloc[:, :-1] drops the trailing CŒUR column (all zeros in this subset).
notsickgenre = is_notsick.groupby(['SEXE']).mean(numeric_only=True).round(2)
notsickgenre.iloc[:, :-1]
| AGE | PAR | CHOLESTEROL | GAJ | FCMAX | DEPRESSION | |
|---|---|---|---|---|---|---|
| SEXE | ||||||
| femme | 51.2 | 128.79 | 247.44 | 0.07 | 149.05 | 0.44 |
| homme | 50.2 | 130.93 | 216.24 | 0.13 | 147.67 | 0.39 |
# Mean of each quantitative column, by gender, for sick people.
# FIX: numeric_only=True is required on pandas >= 2.0, where GroupBy.mean()
# raises on object columns instead of silently dropping them.
# iloc[:, :-1] drops the trailing CŒUR column (all ones in this subset).
sickgenre = is_sick.groupby(['SEXE']).mean(numeric_only=True).round(2)
sickgenre.iloc[:, :-1]
| AGE | PAR | CHOLESTEROL | GAJ | FCMAX | DEPRESSION | |
|---|---|---|---|---|---|---|
| SEXE | ||||||
| femme | 56.18 | 142.00 | 223.34 | 0.32 | 137.82 | 1.34 |
| homme | 55.87 | 133.33 | 170.77 | 0.34 | 126.55 | 1.27 |
#3/ Heart-disease symptom: chest-pain type (TDT)
# Share (in %) of each chest-pain type within the sick / not-sick groups.
df_cat_percent = data.groupby('maladie')['TDT'].value_counts(normalize=True).rename('pourcent').reset_index()
df_cat_percent['pourcent'] = np.round(df_cat_percent['pourcent']*100)
fig = px.bar(df_cat_percent, x='maladie',y='pourcent',color='TDT', barmode='group',
             title='<b>Maladies cardiaques et Douleurs thoraciques</b>',
             color_discrete_sequence=px.colors.qualitative.Pastel)
fig.show()
print("[AT : angine typique, AA : angine atypique, DNA : douleur non angineuse, ASY : asymptomatique]")
print("La proportion de patients atteints de maladies cardiaques asymptomatiques est bien plus élevée que ceux qui souffrent de certains types de douleurs thoraciques.")
[AT : angine typique, AA : angine atypique, DNA : douleur non angineuse, ASY : asymptomatique] La proportion de patients atteints de maladies cardiaques asymptomatiques est bien plus élevée que ceux qui souffrent de certains types de douleurs thoraciques.
#3/ Heart-disease symptom: exercise-induced angina (ANGINE)
# Comparison with healthy people.
isnot_sick = data[data.CŒUR == 0]
fig_notsick = px.scatter(isnot_sick, x="AGE", y="TDT", color="SEXE")
#fig_notsick.show()
fig_nosick2 = px.scatter(isnot_sick, x="AGE", y="ANGINE", size="AGE", color="SEXE")
#fig_nosick2.show()
# FIX: only two traces / two subplot titles are used, so request 2 rows
# (was rows=3, which left an empty bottom third of the figure).
fig_angine = make_subplots(rows=2, cols=1, subplot_titles=("Personnes malades","Personnes non-malades"))
fig_angine.append_trace(go.Scatter(x=is_sick['AGE'],y=is_sick['ANGINE'],mode='markers',), row=1, col=1)
fig_angine.append_trace(go.Scatter( x=isnot_sick['AGE'], y=isnot_sick['ANGINE'],mode='markers',), row=2, col=1)
fig_angine.update_layout(showlegend=False,height=600, width=600, title_text="Analyse Angine")
fig_angine.show()
#4/ What causes the disease?
# Aggravating cardiac-risk factors examined below: cholesterol, glycaemia,
# ST depression, and resting blood pressure.
# Violin of resting blood pressure (PAR) by diagnosis, split by sex.
plot_PAR = px.violin(data, x=data['CŒUR'], y=data['PAR'], color=data['SEXE'],points="all")
plot_PAR.show()
print(' Les personnes ayant une pression artérielle plus élevée font principalement partie des patients atteints de maladies cardiaques.')
Les personnes ayant une pression artérielle plus élevée font principalement partie des patients atteints de maladies cardiaques.
# Violin of cholesterol by diagnosis, split by sex.
plot_CHOL= px.violin(data, x=data['CŒUR'], y=data['CHOLESTEROL'], color=data['SEXE'],points="all")
plot_CHOL.show()
print('Les patients atteints de maladies cardiaques ont tendance à avoir un taux de cholestérol plus élevé que le groupe opposé.')
Les patients atteints de maladies cardiaques ont tendance à avoir un taux de cholestérol plus élevé que le groupe opposé.
# Violin of ST depression by diagnosis, split by sex.
# NOTE: the column name really is 'DEPRESSION ' with a trailing space.
plot_DEP= px.violin(data, x=data['CŒUR'], y=data['DEPRESSION '], color=data['SEXE'],points="all")
plot_DEP.show()
print('La depression peut être un symptôme et une cause aggravante de la maladie cardiaque.')
La depression peut être un symptôme et une cause aggravante de la maladie cardiaque.
#4/ What causes the disease?
# Distribution of fasting blood sugar (GAJ) by diagnosis.
# GAJ is 0/1 in this dataset (see describe(): min 0, max 1); presumably
# 1 means glycaemia > 120 mg/dl — TODO confirm against the data dictionary.
fig = px.histogram(data,x='GAJ',color='maladie', barmode='group',
                   title='<b>Maladie cardiaque et glycémie</b>',
                   color_discrete_sequence=px.colors.qualitative.Safe)
fig.show()
print("Les personnes avec une glycémie <120 mg, ont un nombre plus élevé de maladies non cardiaques alors que les personnes avec une glycemie élevée presentes plus de maladies cardiaques ")
Les personnes avec une glycémie <120 mg, ont un nombre plus élevé de maladies non cardiaques alors que les personnes avec une glycemie élevée presentes plus de maladies cardiaques
# 5/ Stress-test results
# 3D view: max heart rate (FCMAX) of sick vs healthy people by age,
# coloured by sex, marker size proportional to cholesterol.
fig_fcmax = px.scatter_3d(data, x="AGE",
                          y="maladie",
                          z='FCMAX',
                          color='SEXE',
                          size="CHOLESTEROL",title="Répartition des hommes & femmes cardiaques selon leur fréquence cardiaque maximale, et l'âge")
fig_fcmax.show()
print("Ici nous voyons la distribution homme et femme des personnes malades et non malades en fonction de la frequence cardiaque. Nous voyons que cette dernière est plus faible pour les personnes malades ")
Ici nous voyons la distribution homme et femme des personnes malades et non malades en fonction de la frequence cardiaque. Nous voyons que cette dernière est plus faible pour les personnes malades
# Age vs. maximum heart rate, split by heart-disease diagnosis.
plt.figure(figsize=(10, 6))
# Boolean mask over the 0/1 target: True = sick, False = healthy.
sick_mask = data.CŒUR == 1
# Sick patients in salmon, healthy patients in light green.
plt.scatter(data.AGE[sick_mask], data.FCMAX[sick_mask], c="salmon")
plt.scatter(data.AGE[~sick_mask], data.FCMAX[~sick_mask], c="lightgreen")
# Axis labels, title and legend (legend order matches the plotting order).
plt.title("Maladie cardiaque en fonction de l'âge et de la frequence cardiaque maximale")
plt.xlabel("Age")
plt.ylabel("Frequence cardiaque maximale")
plt.legend(["Malade", "Pas malade"])
print(" Le graphique montre qu'un patient malade semble avoir une plus faible frequence cardiaque qu'un patient en bonne santé")
Le graphique montre qu'un patient malade semble avoir une plus faible frequence cardiaque qu'un patient en bonne santé
# Age vs. max heart rate with an OLS trend line per sex.
fig = px.scatter(data, x='AGE',y='FCMAX', color="SEXE",trendline='ols',
                 title='<b>Maladie cardiaque en fonction de l âge et de la frequence cardiaque maximale</b>',
                 color_discrete_sequence=px.colors.qualitative.Set2)
fig.show()
print('Plus un patient a de l âge plus la frequence cardiaque décline')
Plus un patient a de l âge plus la frequence cardiaque décline
# 3D view restricted to sick patients: FCMAX by age and sex, marker size
# proportional to cholesterol, coloured by exercise-induced angina.
fig_scattertroisd = px.scatter_3d(is_sick, x='AGE',
                                  y='SEXE',
                                  z='FCMAX',
                                  size = 'CHOLESTEROL',
                                  color='ANGINE',
                                  title="Répartition des hommes & femmes cardiaques selon leur fréquence cardiaque maximale,l'âge et l'angine")
fig_scattertroisd.show()
# ECG analysis for sick vs healthy people.
# groupby(...).count() yields one row per (sex, ECG) pair; the 'AGE' column
# then simply carries the number of people in that pair (all columns are
# non-null, so any column would give the same count).
lastsick=is_sick.groupby(['SEXE','ECG']).count().reset_index()
fig_issick= px.funnel(lastsick, x='AGE', y='ECG', color='SEXE',title="Nombre de personne malade en fonction de l'ECG")
fig_issick.show()
lastnot=is_notsick.groupby(['SEXE','ECG']).count().reset_index()
fig_isnot= px.funnel(lastnot, x='AGE', y='ECG', color='SEXE',title="Nombre de personne non-malade en fonction de l'ECG")
fig_isnot.show()
# Percentage of sick / not-sick people within each ECG result category.
df_cat_percent = data.groupby('ECG')['maladie'].value_counts(normalize=True).rename('percent').reset_index()
df_cat_percent['percent'] = np.round(df_cat_percent['percent']*100)
fig = px.bar(df_cat_percent.sort_values('maladie',ascending=False),
             x='ECG',y='percent',color='maladie', barmode='stack',
             title='<b>Maladie cardiaque : Analyse des résultats d éléctrocardiogramme ECG</b>')
fig.show()
print("Le groupe de personnes avec des résultats d'ECG normaux a un nombre plus élevé de maladies non cardiaques, tandis que les autres groupes montrent le contraire.")
Le groupe de personnes avec des résultats d'ECG normaux a un nombre plus élevé de maladies non cardiaques, tandis que les autres groupes montrent le contraire.
# Conclusion: age distribution of sick patients, split by gender.
plt.figure(figsize=(20, 10))
sns.countplot(x='AGE', hue='SEXE', data=is_sick,palette="Set2")
# BUG FIX: '' inside a single-quoted literal concatenates adjacent strings,
# so "c''est" printed as "cest" (visible in the cell output); use double quotes.
print("En conclusion, dans notre echantillon nous observons que c'est entre 45 et 65 ans que les personnes sont le plus touchés par la maladie cardiaque, en particulier les hommes")
En conclusion, dans notre echantillon nous observons que cest entre 45 et 65 ans que les personnes sont le plus touchés par la maladie cardiaque, en particulier les hommes
**Build a KNN classifier for predicting cardiac disease in people based on some of their features.
The classifier is built following a classical Machine Learning workflow (for expert practitioners, the workflow should be more complete).
Explore data
Preprocess data : use only quantitative data
Build model
Evaluate model
Conclusions
# Pre-processing
# Load the label encoder from scikit-learn
from sklearn.preprocessing import LabelEncoder
# List the available columns (note: 'DEPRESSION ' has a trailing space).
data.columns
Index(['AGE', 'SEXE', 'TDT', 'PAR', 'CHOLESTEROL', 'GAJ', 'ECG', 'FCMAX',
'ANGINE', 'DEPRESSION ', 'PENTE', 'CŒUR', 'maladie'],
dtype='object')
# Define the feature matrix X: quantitative columns only.
X = data[['AGE', 'PAR', 'CHOLESTEROL', 'GAJ', 'FCMAX', 'DEPRESSION ']]
# Qualitative columns, kept aside for ordinal encoding further below.
Xqual= data[['SEXE', 'TDT', 'ECG', 'ANGINE', 'PENTE']]
Xqual
| SEXE | TDT | ECG | ANGINE | PENTE | |
|---|---|---|---|---|---|
| 0 | homme | AA | Normal | Non | Ascendant |
| 1 | femme | DNA | Normal | Non | Plat |
| 2 | homme | AA | ST | Non | Ascendant |
| 3 | femme | ASY | Normal | Oui | Plat |
| 4 | homme | DNA | Normal | Non | Ascendant |
| ... | ... | ... | ... | ... | ... |
| 913 | homme | AT | Normal | Non | Plat |
| 914 | homme | ASY | Normal | Non | Plat |
| 915 | homme | ASY | Normal | Oui | Plat |
| 916 | femme | AA | LVH | Non | Plat |
| 917 | homme | DNA | Normal | Non | Ascendant |
918 rows × 5 columns
# Define the target data (y): CŒUR, 1 = heart disease, 0 = healthy.
y = data['CŒUR']
y
0 0
1 1
2 0
3 1
4 0
..
913 1
914 1
915 1
916 1
917 0
Name: CŒUR, Length: 918, dtype: int64
# Initialise the encoder
label_encod = LabelEncoder() # LabelEncoder instance
# Fit the encoder on the target and transform it in one call.
# NOTE: CŒUR is already 0/1, so this mainly converts the Series to a numpy
# array with a guaranteed contiguous 0..n_classes-1 integer encoding.
y = label_encod.fit_transform(y)
y
array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0])
**Train-test-split
# Load module from scikit-learn
from sklearn.model_selection import train_test_split
# Split data into 2 parts : train & test.
# random_state=42 fixes the shuffle seed so the split — and therefore the
# reported accuracy — is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Check the size of each subset of data
print("The length of the initial dataset is :", len(X))
print("The length of the train dataset is :", len(X_train))
print("The length of the test dataset is :", len(X_test))
The length of the initial dataset is : 918 The length of the train dataset is : 688 The length of the test dataset is : 230
# Visualise X_train (note the shuffled index produced by train_test_split).
X_train
| AGE | PAR | CHOLESTEROL | GAJ | FCMAX | DEPRESSION | |
|---|---|---|---|---|---|---|
| 155 | 56 | 155 | 342 | 1 | 150 | 3.0 |
| 362 | 56 | 155 | 0 | 0 | 99 | 0.0 |
| 869 | 59 | 150 | 212 | 1 | 157 | 1.6 |
| 101 | 51 | 130 | 179 | 0 | 100 | 0.0 |
| 199 | 57 | 130 | 308 | 0 | 98 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 106 | 48 | 120 | 254 | 0 | 110 | 0.0 |
| 270 | 45 | 120 | 225 | 0 | 140 | 0.0 |
| 860 | 60 | 130 | 253 | 0 | 144 | 1.4 |
| 435 | 60 | 152 | 0 | 0 | 118 | 0.0 |
| 102 | 40 | 150 | 392 | 0 | 130 | 2.0 |
688 rows × 6 columns
# Visualise y_train (a numpy array after the label encoding above).
y_train
array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0,
1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1,
1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0,
0, 0, 0, 1, 0, 1])
**Build Model
We are going to build a Classifier model by applying :
Initialisation step,
Fitting step,
Prediction.
**Test KNN
# Load module from scikit-learn
from sklearn.neighbors import KNeighborsClassifier
# Manual initialisation with k = 5 chosen by hand (kept for reference;
# an odd k avoids ties in binary voting):
#KNN_classifier = KNeighborsClassifier(n_neighbors=5)
# Smarter choice: scan candidate k values and record the test error of each.
error = []
# Calculating the error for k in range(1, 30), i.e. k = 1..29.
# NOTE(review): the original comment said "between 1 and 30", but 30 itself
# is never tried — confirm whether that upper bound was intended.
for i in range(1, 30):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # Mean misclassification rate on the test set.
    error.append(np.mean(pred_i != y_test))
plt.figure(figsize=(12, 6))
plt.plot(range(1, 30), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
# index(min) is 0-based, hence the +1 to recover the k value.
print("Minimum error:-",min(error),"at K =",error.index(min(error))+1)
Minimum error:- 0.2826086956521739 at K = 5
# Apply the K-NN algorithm with the best k found above (k = 5).
# BUG FIX: the estimator was bound to `classifier`, but every subsequent cell
# calls `KNN_classifier` — a name whose definition was commented out above —
# which raises NameError; bind the name the rest of the script expects.
KNN_classifier = KNeighborsClassifier(n_neighbors=5)
# Let's see the initialised model
KNN_classifier
KNeighborsClassifier()
# Fit the model on the training split.
KNN_classifier.fit(X_train, y_train)
KNeighborsClassifier()
# Predict the class (0 = healthy, 1 = sick) for each test sample.
prediction = KNN_classifier.predict(X_test)
prediction
array([0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
0, 1, 1, 0, 1, 1, 1, 1, 0, 0])
# Ground-truth labels of the test split, for visual comparison with `prediction`.
y_test
array([0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
0, 1, 1, 0, 1, 0, 1, 1, 0, 0])
**Evaluate
# Load metrics from scikit-learn
from sklearn.metrics import confusion_matrix, accuracy_score
# Compute the accuracy (in %)
accuracy = accuracy_score(y_test, prediction)*100
# Let's see the global performance
print('Accuracy of our model is equal ' + str(round(accuracy, 2)) + ' %.')
Accuracy of our model is equal 71.74 %.
# Confusion matrix: rows = true class, columns = predicted class.
confusion_matrix(y_test, prediction)
array([[69, 29],
[36, 96]])
# Plot the confusion matrix for the fitted KNN model.
# FIX: sklearn.metrics.plot_confusion_matrix was deprecated in scikit-learn
# 1.0 and removed in 1.2; ConfusionMatrixDisplay.from_estimator is the
# drop-in replacement and produces the same figure.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(KNN_classifier, X_test, y_test,
                       cmap=plt.cm.summer, # other color palettes : https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html
                       normalize=None)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f041db77790>
**Essaie KNN avec Standard SCALER sur données quantitatives
# KNN retried with StandardScaler applied to the quantitative features.
# NOTE: MinMaxScaler and RobustScaler are imported but not used below.
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
# Standardise X to zero mean / unit variance: KNN is distance-based, so
# features with large raw ranges would otherwise dominate the metric.
X_stdscl = StandardScaler().fit_transform(X)
X_stdscl
array([[-1.4331398 , 0.41090889, 0.82507026, -0.55134134, 1.38292822,
-0.83243239],
[-0.47848359, 1.49175234, -0.17196105, -0.55134134, 0.75415714,
0.10566353],
[-1.75135854, -0.12951283, 0.7701878 , -0.55134134, -1.52513802,
-0.83243239],
...,
[ 0.37009972, -0.12951283, -0.62016778, -0.55134134, -0.85706875,
0.29328271],
[ 0.37009972, -0.12951283, 0.34027522, -0.55134134, 1.4615246 ,
-0.83243239],
[-1.64528563, 0.30282455, -0.21769643, -0.55134134, 1.42222641,
-0.83243239]])
# Split data into 2 parts : train & test.
# random_state=42 fixes the shuffle seed so the split (and hence the measured
# accuracy) is reproducible; a different seed gives a different score.
X_trains, X_tests, y_trains, y_tests = train_test_split(X_stdscl, y, test_size=0.25, random_state=42)
# Initialisation of a KNN classifier with k = 5 neighbours
# (NOTE: the original comment said k = 3, but n_neighbors=5 is what is used)
KNN_classifier_sc = KNeighborsClassifier(n_neighbors=5)
# Display the estimator's configuration
KNN_classifier_sc
KNeighborsClassifier()
# Fit the KNN classifier on the scaled training split
KNN_classifier_sc.fit(X_trains, y_trains)
KNeighborsClassifier()
# Sanity check: the train and test sizes should sum to the full dataset size
print("The length of the initial dataset is :", len(X))
print("The length of the train dataset is :", len(X_trains))
print("The length of the test dataset is :", len(X_tests))
The length of the initial dataset is : 918 The length of the train dataset is : 688 The length of the test dataset is : 230
# Predict on the held-out test set and measure the accuracy (in %)
prediction_scale = KNN_classifier_sc.predict(X_tests)
accuracy = 100 * accuracy_score(y_tests, prediction_scale)
# Display the accuracy
accuracy
76.52173913043478
**Essai de KNN avec OrdinalEncoder sur les données qualitatives et StandardScaler sur les données quantitatives**
# Trial: ordinal encoding of the qualitative columns
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
# Learn the category -> integer-code mapping for each column
enc.fit(Xqual)
OrdinalEncoder()
# Inspect the learned category ordering for each qualitative column
enc.categories_
[array(['femme', 'homme'], dtype=object), array(['AA', 'ASY', 'AT', 'DNA'], dtype=object), array(['LVH', 'Normal', 'ST'], dtype=object), array(['Non', 'Oui'], dtype=object), array(['Ascendant', 'Descendant', 'Plat'], dtype=object)]
# Map each category to its integer code.
# The encoder was already fitted just above (enc.fit(Xqual)), so a plain
# transform() is enough — the original called fit_transform(), which
# redundantly refits and recomputes the same categories_.
Xqualité = enc.transform(Xqual)
# Display the encoded matrix
Xqualité
array([[1., 0., 1., 0., 0.],
[0., 3., 1., 0., 2.],
[1., 0., 2., 0., 0.],
...,
[1., 1., 1., 1., 2.],
[0., 0., 0., 0., 2.],
[1., 3., 1., 0., 0.]])
# Column-wise concatenation: scaled quantitative features followed by the
# ordinal-encoded qualitative features
C = np.hstack((X_stdscl, Xqualité))
# Display the combined feature matrix
C
array([[-1.4331398 , 0.41090889, 0.82507026, ..., 1. ,
0. , 0. ],
[-0.47848359, 1.49175234, -0.17196105, ..., 1. ,
0. , 2. ],
[-1.75135854, -0.12951283, 0.7701878 , ..., 2. ,
0. , 0. ],
...,
[ 0.37009972, -0.12951283, -0.62016778, ..., 1. ,
1. , 2. ],
[ 0.37009972, -0.12951283, 0.34027522, ..., 0. ,
0. , 2. ],
[-1.64528563, 0.30282455, -0.21769643, ..., 1. ,
0. , 0. ]])
# Split the combined (scaled + encoded) features into train & test sets;
# random_state=42 fixes the shuffle seed so the accuracy below is reproducible
# (a different seed gives a different split and hence a different score)
X_trains, X_tests, y_trains, y_tests = train_test_split(C, y, test_size=0.25, random_state=42)
# Initialise a KNN classifier with k = 5 neighbours
# (NOTE: the original comment said k = 3, but n_neighbors=5 is what is used)
KNN_classifier_sc = KNeighborsClassifier(n_neighbors=5)
# Fit the model on the training split
KNN_classifier_sc.fit(X_trains, y_trains)
KNeighborsClassifier()
# Display the fitted estimator's configuration
KNN_classifier_sc
KNeighborsClassifier()
# Predict on the test split of the combined features and measure accuracy (in %)
prediction_scale = KNN_classifier_sc.predict(X_tests)
accuracy = 100 * accuracy_score(y_tests, prediction_scale)
# Display the accuracy
accuracy
86.08695652173914
**Decision tree avec données scalées et encodées**
# Baseline: a logistic regression fitted on the same scaled+encoded features
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_trains, y_trains)
print('DecisionTreeClassifier')
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(max_depth=5)
decision_tree.fit(X_trains, y_trains)
# BUG FIX: the original evaluated the logistic-regression `model` here while
# labelling and storing the results (score_dtc / out_dtc) as the decision
# tree's. Predict and score with the decision tree, as the section intends.
Y_pred = decision_tree.predict(X_tests)
score = decision_tree.score(X_trains, y_trains)
print('Training Score:', score)
score = decision_tree.score(X_tests, y_tests)
print('Testing Score:', score)
output = pd.DataFrame({'Predicted':Y_pred}) # Heart-Disease yes or no? 1/0
print(output.head())
# Fraction of test people the tree predicts to have heart disease
people = output.loc[output.Predicted == 1]["Predicted"]
rate_people = 0
if len(people) > 0 :
    rate_people = len(people)/len(output)
print("% of people predicted with heart-disease:", rate_people)
# Keep the decision-tree results for later comparison
score_dtc = score
out_dtc = output
DecisionTreeClassifier Training Score: 0.8561046511627907 Testing Score: 0.8304347826086956 Predicted 0 0 1 0 2 1 3 1 4 0 % of people predicted with heart-disease: 0.5260869565217391
# Per-class precision / recall / F1 of the predictions on the test set
from sklearn.metrics import classification_report
print(classification_report(y_tests,Y_pred))
precision recall f1-score support
0 0.77 0.86 0.81 98
1 0.88 0.81 0.85 132
accuracy 0.83 230
macro avg 0.83 0.83 0.83 230
weighted avg 0.84 0.83 0.83 230
# Heatmap of the confusion matrix for the decision-tree predictions.
from sklearn.metrics import confusion_matrix
# FIX: the original did `confusion_matrix = confusion_matrix(...)`, rebinding
# the name and shadowing the sklearn function for the rest of the session;
# use a distinct variable name instead.
cm = confusion_matrix(y_tests, Y_pred)
class_names = [0, 1]
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
sns.heatmap(pd.DataFrame(cm), annot = True, cmap = 'Greens', fmt = 'g')
ax.xaxis.set_label_position('top')
plt.tight_layout()
plt.title('Confusion matrix for decision tree')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()
**Random forest**
# Feature names taken from the raw dataframe.
# NOTE(review): `data.columns` includes the target column (CŒUR) and follows
# the raw column order, whereas the model is trained on
# C = [scaled quantitative features, encoded qualitative features]
# (see the np.concatenate step above) — so the name printed next to each
# importance is unlikely to match the actual feature. Verify against the
# column order used to build X_stdscl and Xqualité.
names=list(data.columns)
# Using Random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_trains,y_trains)
y_pred_rf = rf.predict(X_tests)
print("Accuracy of Random Forest Classifier :: ", metrics.accuracy_score(y_tests, y_pred_rf))
#Find the score of each feature in model and drop the features with low scores
f_imp = rf.feature_importances_
for i,v in enumerate(f_imp):
    print('Feature: %s, Score: %.5f' % (names[i],v))
Accuracy of Random Forest Classifier :: 0.8826086956521739 Feature: AGE, Score: 0.08914 Feature: SEXE, Score: 0.06943 Feature: TDT, Score: 0.11275 Feature: PAR, Score: 0.02087 Feature: CHOLESTEROL, Score: 0.10893 Feature: GAJ, Score: 0.12505 Feature: ECG, Score: 0.03500 Feature: FCMAX, Score: 0.07216 Feature: ANGINE, Score: 0.02479 Feature: DEPRESSION , Score: 0.11182 Feature: PENTE, Score: 0.23006